#2017 Analysis:
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
nyc_df_17<- read.csv('/Users/nigel/Downloads/nyc_clean.csv',header = TRUE)
nyc_df_17 <- nyc_df_17 %>% filter(Year == 2017)
# Top 10 Counties with highest Percent of Children with Elevated Blood Levels
nyc_df_17_top_10 <- nyc_df_17[order(-nyc_df_17$Total.Elevated.Blood.Levels), ]
top_10_counties <- head(nyc_df_17_top_10, 10)
library(ggplot2)
ggplot(top_10_counties, aes(x = reorder(County, Total.Elevated.Blood.Levels), y = Total.Elevated.Blood.Levels, fill = County)) +
geom_bar(stat = "identity") +
geom_text(aes(label = scales::percent(Total.Elevated.Blood.Levels/sum(Total.Elevated.Blood.Levels))), position = position_stack(vjust = 0.5)) +
labs(title = "Top 10 Counties with Highest Percent of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bottom 10 Counties (Not used just for reference)
nyc_df_17_bottom_10 <- nyc_df_17[order(nyc_df_17$Total.Elevated.Blood.Levels), ]
bottom_10_counties <- head(nyc_df_17_bottom_10, 10)
ggplot(bottom_10_counties, aes(x = reorder(County, desc(Total.Elevated.Blood.Levels)), y = Total.Elevated.Blood.Levels, fill = County)) +
geom_bar(stat = "identity", width = 0.5) +
labs(title = "Bottom 10 Counties with Lowest Number of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_text(aes(label = Total.Elevated.Blood.Levels), vjust = -0.5, size = 3, color = "black")
#Median Income Comparison between Counties with Lowest and Highest EBLLs
income_county_data <- rbind(
data.frame(Group = "Lowest EBLL", County = factor(bottom_10_counties$County, levels = unique(bottom_10_counties$County)), Per.Capita.Income = bottom_10_counties$Per.Capita.Income),
data.frame(Group = "Highest EBLL", County = factor(top_10_counties$County, levels = unique(top_10_counties$County)), Per.Capita.Income = top_10_counties$Per.Capita.Income)
)
# Bar graph (Not Recommended)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, fill = Group)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
x = "County", y = "Median Income") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Scatter Plot(Use this)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, color = Group)) +
geom_point(size = 4, alpha = 0.7, position = position_jitter(width = 0.3)) +
geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "black") +
labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
x = "County", y = "Median Income") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top") +
scale_color_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759")) +
scale_fill_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759"))
## `geom_smooth()` using formula = 'y ~ x'
# Plot the data points and the regression line
ggplot(nyc_df_17, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Linear Regression",
x = "Number of Children with EBLLs",
y = "Income 2019")
## `geom_smooth()` using formula = 'y ~ x'
model <- lm(Per.Capita.Income ~ Total.Elevated.Blood.Levels, data = nyc_df_17)
# Plot the data points and the regression line
ggplot(nyc_df_17, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Linear Regression",
x = "Number of Children with EBLLs",
y = "Income 2017")
## `geom_smooth()` using formula = 'y ~ x'
# KMeans Clustering for EBLL and Income
set.seed(123)
data_for_clustering <- nyc_df_17[, c("Total.Elevated.Blood.Levels", "Per.Capita.Income")]
scaled_data <- scale(data_for_clustering)
k <- 3
kmeans_model <- kmeans(scaled_data, centers = k)
nyc_df_17_copy<- nyc_df_17
nyc_df_17_copy$cluster <- as.factor(kmeans_model$cluster)
ggplot(nyc_df_17_copy, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income, color = cluster)) +
geom_point() +
labs(title = "K-Means Clustering",
x = "Number of Children with EBLLs",
y = " Median Income ") +
scale_color_discrete(name = "Cluster")
#rate of elev vs percent change
# Children Tested vs Income and rate of elevation
custom_theme <- theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
axis.title = element_text(size = 16, face = "bold"),
axis.text = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)
)
ggplot(nyc_df_17, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
geom_point(alpha = 0.7) +
scale_color_gradient(low = "blue", high = "orange") +
scale_size_continuous(range = c(3, 15)) +
labs(title = "Children Tested vs Income",
x = "Number of Children Tested",
y = "Income",
size = "Number of Children Tested",
color = "Rate of Elevation in EBLLs") +
custom_theme
# Children Tested vs Income Plot Zoomed in on Clustered Area(Axis Changed)
custom_theme <- theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
axis.title = element_text(size = 16, face = "bold"),
axis.text = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)
)
ggplot(nyc_df_17, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
geom_point(alpha = 0.7) +
scale_color_gradient(low = "blue", high = "orange") +
scale_size_continuous(range = c(3, 15)) +
labs(title = "Children Tested vs Income",
x = "Number of Children Tested",
y = "Income",
size = "Number of Children Tested",
color = "Rate of Elevation in EBLLs") +
custom_theme +
coord_cartesian(xlim = c(0, 5000), ylim = c(26000, 85000))
#2018 Analysis:
library(ggplot2)
library(corrplot)
library(magrittr)
library(dplyr)
nyc_df_18<- read.csv('/Users/nigel/Downloads/nyc_clean.csv',header = TRUE)
nyc_df_18 <- nyc_df_18 %>% filter(Year == 2018)
# Top 10 Counties with highest Percent of Children with Elevated Blood Levels
nyc_df_18_top_10 <- nyc_df_18[order(-nyc_df_18$Total.Elevated.Blood.Levels), ]
top_10_counties <- head(nyc_df_18_top_10, 10)
library(ggplot2)
ggplot(top_10_counties, aes(x = reorder(County, Total.Elevated.Blood.Levels), y = Total.Elevated.Blood.Levels, fill = County)) +
geom_bar(stat = "identity") +
geom_text(aes(label = scales::percent(Total.Elevated.Blood.Levels/sum(Total.Elevated.Blood.Levels))), position = position_stack(vjust = 0.5)) +
labs(title = "Top 10 Counties with Highest Percent of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bottom 10 Counties (Not used just for reference)
nyc_df_18_bottom_10 <- nyc_df_18[order(nyc_df_18$Total.Elevated.Blood.Levels), ]
bottom_10_counties <- head(nyc_df_18_bottom_10, 10)
ggplot(bottom_10_counties, aes(x = reorder(County, desc(Total.Elevated.Blood.Levels)), y = Total.Elevated.Blood.Levels, fill = County)) +
geom_bar(stat = "identity", width = 0.5) +
labs(title = "Bottom 10 Counties with Lowest Number of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_text(aes(label = Total.Elevated.Blood.Levels), vjust = -0.5, size = 3, color = "black")
#Median Income Comparison between Counties with Lowest and Highest EBLLs
income_county_data <- rbind(
data.frame(Group = "Lowest EBLL", County = factor(bottom_10_counties$County, levels = unique(bottom_10_counties$County)), Per.Capita.Income = bottom_10_counties$Per.Capita.Income),
data.frame(Group = "Highest EBLL", County = factor(top_10_counties$County, levels = unique(top_10_counties$County)), Per.Capita.Income = top_10_counties$Per.Capita.Income)
)
# Bar graph (Not Recommended)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, fill = Group)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
x = "County", y = "Median Income") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Scatter Plot(Use this)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, color = Group)) +
geom_point(size = 4, alpha = 0.7, position = position_jitter(width = 0.3)) +
geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "black") +
labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
x = "County", y = "Median Income") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top") +
scale_color_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759")) +
scale_fill_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759"))
## `geom_smooth()` using formula = 'y ~ x'
# Plot the data points and the regression line
ggplot(nyc_df_18, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Linear Regression",
x = "Number of Children with EBLLs",
y = "Income 2019")
## `geom_smooth()` using formula = 'y ~ x'
model <- lm(Per.Capita.Income ~ Total.Elevated.Blood.Levels, data = nyc_df_18)
# Plot the data points and the regression line
ggplot(nyc_df_18, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Linear Regression",
x = "Number of Children with EBLLs",
y = "Income 2018")
## `geom_smooth()` using formula = 'y ~ x'
# KMeans Clustering for EBLL and Income
set.seed(123)
data_for_clustering <- nyc_df_18[, c("Total.Elevated.Blood.Levels", "Per.Capita.Income")]
scaled_data <- scale(data_for_clustering)
k <- 3
kmeans_model <- kmeans(scaled_data, centers = k)
nyc_df_18_copy<- nyc_df_18
nyc_df_18_copy$cluster <- as.factor(kmeans_model$cluster)
ggplot(nyc_df_18_copy, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income, color = cluster)) +
geom_point() +
labs(title = "K-Means Clustering",
x = "Number of Children with EBLLs",
y = " Median Income ") +
scale_color_discrete(name = "Cluster")
#rate of elev vs percent change
# Children Tested vs Income and rate of elevation
custom_theme <- theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
axis.title = element_text(size = 16, face = "bold"),
axis.text = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)
)
ggplot(nyc_df_18, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
geom_point(alpha = 0.7) +
scale_color_gradient(low = "blue", high = "orange") +
scale_size_continuous(range = c(3, 15)) +
labs(title = "Children Tested vs Income",
x = "Number of Children Tested",
y = "Income",
size = "Number of Children Tested",
color = "Rate of Elevation in EBLLs") +
custom_theme
# Children Tested vs Income Plot Zoomed in on Clustered Area(Axis Changed)
custom_theme <- theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
axis.title = element_text(size = 16, face = "bold"),
axis.text = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)
)
ggplot(nyc_df_18, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
geom_point(alpha = 0.7) +
scale_color_gradient(low = "blue", high = "orange") +
scale_size_continuous(range = c(3, 15)) +
labs(title = "Children Tested vs Income",
x = "Number of Children Tested",
y = "Income",
size = "Number of Children Tested",
color = "Rate of Elevation in EBLLs") +
custom_theme +
coord_cartesian(xlim = c(0, 5000), ylim = c(26000, 85000))
#2019 Analysis:
library(ggplot2)
library(corrplot)
library(magrittr)
library(dplyr)
nyc_df<- read.csv('/Users/nigel/Downloads/nyc_clean.csv',header = TRUE)
nyc_df <- nyc_df %>% filter(Year == 2019)
# Top 10 Counties with highest Percent of Children with Elevated Blood Levels
nyc_df_top_10 <- nyc_df[order(-nyc_df$Total.Elevated.Blood.Levels), ]
top_10_counties <- head(nyc_df_top_10, 10)
library(ggplot2)
ggplot(top_10_counties, aes(x = reorder(County, Total.Elevated.Blood.Levels), y = Total.Elevated.Blood.Levels, fill = County)) +
geom_bar(stat = "identity") +
geom_text(aes(label = scales::percent(Total.Elevated.Blood.Levels/sum(Total.Elevated.Blood.Levels))), position = position_stack(vjust = 0.5)) +
labs(title = "Top 10 Counties with Highest Percent of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bottom 10 Counties (Not used just for reference)
nyc_df_bottom_10 <- nyc_df[order(nyc_df$Total.Elevated.Blood.Levels), ]
bottom_10_counties <- head(nyc_df_bottom_10, 10)
ggplot(bottom_10_counties, aes(x = reorder(County, desc(Total.Elevated.Blood.Levels)), y = Total.Elevated.Blood.Levels, fill = County)) +
geom_bar(stat = "identity", width = 0.5) +
labs(title = "Bottom 10 Counties with Lowest Number of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_text(aes(label = Total.Elevated.Blood.Levels), vjust = -0.5, size = 3, color = "black")
#Median Income Comparison between Counties with Lowest and Highest EBLLs
income_county_data <- rbind(
data.frame(Group = "Lowest EBLL", County = factor(bottom_10_counties$County, levels = unique(bottom_10_counties$County)), Per.Capita.Income = bottom_10_counties$Per.Capita.Income),
data.frame(Group = "Highest EBLL", County = factor(top_10_counties$County, levels = unique(top_10_counties$County)), Per.Capita.Income = top_10_counties$Per.Capita.Income)
)
# Bar graph (Not Recommended)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, fill = Group)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
x = "County", y = "Median Income") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Scatter Plot(Use this)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, color = Group)) +
geom_point(size = 4, alpha = 0.7, position = position_jitter(width = 0.3)) +
geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "black") +
labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
x = "County", y = "Median Income") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top") +
scale_color_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759")) +
scale_fill_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759"))
## `geom_smooth()` using formula = 'y ~ x'
# Plot the data points and the regression line
ggplot(nyc_df, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Linear Regression",
x = "Number of Children with EBLLs",
y = "Income 2019")
## `geom_smooth()` using formula = 'y ~ x'
model <- lm(Per.Capita.Income ~ Total.Elevated.Blood.Levels, data = nyc_df)
# Plot the data points and the regression line
ggplot(nyc_df, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Linear Regression",
x = "Number of Children with EBLLs",
y = "Income 2019")
## `geom_smooth()` using formula = 'y ~ x'
# KMeans Clustering for EBLL and Income
set.seed(123)
data_for_clustering <- nyc_df[, c("Total.Elevated.Blood.Levels", "Per.Capita.Income")]
scaled_data <- scale(data_for_clustering)
k <- 3
kmeans_model <- kmeans(scaled_data, centers = k)
nyc_df_copy<- nyc_df
nyc_df_copy$cluster <- as.factor(kmeans_model$cluster)
ggplot(nyc_df_copy, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income, color = cluster)) +
geom_point() +
labs(title = "K-Means Clustering",
x = "Number of Children with EBLLs",
y = " Median Income ") +
scale_color_discrete(name = "Cluster")
#rate of elev vs percent change
# Children Tested vs Income and rate of elevation
custom_theme <- theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
axis.title = element_text(size = 16, face = "bold"),
axis.text = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)
)
ggplot(nyc_df, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
geom_point(alpha = 0.7) +
scale_color_gradient(low = "blue", high = "orange") +
scale_size_continuous(range = c(3, 15)) +
labs(title = "Children Tested vs Income",
x = "Number of Children Tested",
y = "Income",
size = "Number of Children Tested",
color = "Rate of Elevation in EBLLs") +
custom_theme
# Children Tested vs Income Plot Zoomed in on Clustered Area(Axis Changed)
custom_theme <- theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
axis.title = element_text(size = 16, face = "bold"),
axis.text = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)
)
ggplot(nyc_df, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
geom_point(alpha = 0.7) +
scale_color_gradient(low = "blue", high = "orange") +
scale_size_continuous(range = c(3, 15)) +
labs(title = "Children Tested vs Income",
x = "Number of Children Tested",
y = "Income",
size = "Number of Children Tested",
color = "Rate of Elevation in EBLLs") +
custom_theme +
coord_cartesian(xlim = c(0, 5000), ylim = c(26000, 85000))
#install.packages("leaflet")
library(dplyr)
library(ggplot2)
library(corrplot)
#library(leaflet)
# Assuming your dataframe is named 'your_dataframe'
# Replace this with your actual dataframe
your_dataframe <- read.csv('/Users/nigel/Downloads/nyc_clean.csv')
# 1. Summary Statistics
summary(your_dataframe)
## County County.Code Year Tests
## Length:265 Min. : 1.0 Min. :2016 Min. : 6.0
## Class :character 1st Qu.: 29.0 1st Qu.:2017 1st Qu.: 71.0
## Mode :character Median : 59.0 Median :2018 Median : 142.0
## Mean : 62.7 Mean :2018 Mean : 484.7
## 3rd Qu.: 97.0 3rd Qu.:2019 3rd Qu.: 411.0
## Max. :123.0 Max. :2020 Max. :6291.0
##
## Less.than.5.mcg.dL X5.10.mcg.dL X10...15.mcg.dL X15...mcg.dL
## Min. : 6 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 71 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 141 Median : 0.00 Median : 0.000 Median : 0.000
## Mean : 465 Mean : 14.53 Mean : 2.691 Mean : 2.525
## 3rd Qu.: 411 3rd Qu.: 7.00 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :6273 Max. :623.00 Max. :105.000 Max. :87.000
##
## Total.Elevated.Blood.Levels Percent Rate.per.1.000
## Min. : 0.00 Min. :0.00000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.:0.00000 1st Qu.: 0.00
## Median : 0.00 Median :0.00000 Median : 0.00
## Mean : 19.74 Mean :0.01693 Mean : 13.28
## 3rd Qu.: 7.00 3rd Qu.:0.01800 3rd Qu.: 0.00
## Max. :814.00 Max. :0.24500 Max. :508.00
##
## Per.Capita.Income Rank.in.State
## Min. : 34116 Min. : 1
## 1st Qu.: 41896 1st Qu.:14
## Median : 45120 Median :27
## Mean : 49047 Mean :27
## 3rd Qu.: 51558 3rd Qu.:40
## Max. :113477 Max. :53
## NA's :106 NA's :106
ggplot(your_dataframe, aes(x = County, y = Tests, fill = County)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Explore the time series data
ggplot(your_dataframe, aes(x = Year, y = Total.Elevated.Blood.Levels)) +
geom_line() +
labs(title = "Lead Levels Over Time")
# Assuming your dataset is named 'your_data'
# Replace 'your_data' with your actual dataset name
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
your_data <- read.csv('/Users/nigel/Downloads/nyc_clean.csv')
# Convert the data to a tsibble format
your_data_ts <- your_data %>%
as_tsibble(key = County, index = Year) %>%
arrange(County, Year) # Make sure data is sorted by County and Year
# Explore the time series data (plot as kernel density plot)
ggplot(your_data_ts, aes(x = Total.Elevated.Blood.Levels)) +
geom_density(fill = "blue", alpha = 0.5) +
labs(title = "Kernel Density of Lead Levels Over Time",
x = "Lead Levels (mcg/dL)",
y = "Density")+xlim(c(0,50))
## Warning: Removed 24 rows containing non-finite values (`stat_density()`).
#Start of Timeseries Code
library(tidyverse)
library(tsibble)
library(forecast)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
ts=read.csv('/Users/nigel/Downloads/nyc_clean.csv')
ts <- ts %>%
filter(Year >= 2017 & Year <= 2019)
ts$Year<-as.factor(ts$Year)
# Top 10 EBLL Counties
top_counties <- ts %>%
group_by(County) %>%
summarise(total_eblls = sum(Total.Elevated.Blood.Levels)) %>%
arrange(desc(total_eblls)) %>%
slice(1:10) %>%
pull(County)
# Filter Top 10 EBLL Counties
df_top_10 <- ts %>%
filter(County %in% top_counties)
# Plotly plot for Top 10 based on EBLL and Year
plot_ly(df_top_10, x = ~Year, y = ~Total.Elevated.Blood.Levels, color = ~County, type = "scatter", mode = "lines") %>%
layout(title = "Top 10 Counties with Highest levels of EBLL",
xaxis = list(title = "Year"),
yaxis = list(title = "Number of Children with EBLLs"))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Plotly plot for Top 10 Counties with highest EBLL based on Income and Year
plot_ly(df_top_10, x = ~Year, y = ~Per.Capita.Income, color = ~County, type = "scatter", mode = "lines") %>%
layout(title = "Median Income of Top 10 Counties with highest EBLL",
xaxis = list(title = "Year"),
yaxis = list(title = "Median Income"))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Combining both
plot_ly() %>%
add_trace(data = df_top_10, x = ~Year, y = ~Total.Elevated.Blood.Levels, color = ~County,
type = "scatter", mode = "lines", name = "Number of Children with EBLLs",
hoverinfo = "text", text = ~paste("County: ", County, "<br>Year: ", Year, "<br>EBLLs: ", Total.Elevated.Blood.Levels)) %>%
add_trace(data = df_top_10, x = ~Year, y = ~Per.Capita.Income, color = ~County,
type = "scatter", mode = "lines", name = "Median Income",
yaxis = "y2", hoverinfo = "text", text = ~paste("County: ", County, "<br>Year: ", Year, "<br>Income: ", Per.Capita.Income)) %>%
layout(title = "Top 10 Counties with Highest Levels of EBLL and their Median Income",
xaxis = list(title = "Year"),
yaxis = list(title = "Number of Children with EBLLs", side = "left"),
yaxis2 = list(title = "Median Income", side = "right", overlaying = "y", showgrid = FALSE),
showlegend = FALSE,
hovermode = "closest")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Top 10 Elevation Rate Counties
top_elevation_counties <- ts %>%
group_by(County) %>%
summarise(total_elevation = sum(Rate.per.1.000)) %>%
arrange(desc(total_elevation)) %>%
slice(1:10)
# Filter Top 10 Elevation Rate Counties
df_top_10_elevation <- ts %>%
filter(County %in% top_elevation_counties$County)
plot_ly(df_top_10_elevation, x = ~Year, y = ~Rate.per.1.000, color = ~County, type = "scatter", mode = "lines") %>%
layout(title = "Top 10 Counties with highest Rate of Elevation",
xaxis = list(title = "Year"),
yaxis = list(title = "Rate Of Elevation"))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Forecast Blood Levels in 2020
ts_subset <- ts[ts$Year %in% c(2017, 2018, 2019), ]
counties <- unique(ts_subset$County)
forecast_df <- data.frame(County = character(), Forecasted_EBLLs_2020 = numeric(), stringsAsFactors = FALSE)
for (county in counties) {
county_data <- ts_subset$Total.Elevated.Blood.Levels[ts_subset$County == county]
arima_model <- auto.arima(county_data)
forecast_result <- forecast(arima_model, h = 1)
forecast_value <- forecast_result$mean[1]
forecast_df <- rbind(forecast_df, data.frame(County = county, Forecasted_EBLLs_2020 = forecast_value))
}
sorted_forecast_df <- forecast_df[order(-forecast_df$Forecasted_EBLLs_2020), ]
top_10_counties_forecast <- head(sorted_forecast_df, 10)
library(ggplot2)
# Create a bar plot for the top 10 counties forecast
ggplot(top_10_counties_forecast, aes(x = reorder(County, -Forecasted_EBLLs_2020), y = Forecasted_EBLLs_2020, fill = County)) +
geom_bar(stat = "identity") +
geom_text(aes(label = scales::percent(Forecasted_EBLLs_2020/sum(Forecasted_EBLLs_2020))), position = position_stack(vjust = 0.5)) +
labs(title = "Top 10 Counties with Highest Forecasted Blood Levels in 2020", x = "County", y = "Forecasted Blood Levels") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))